Practical on the use of clustering¶
Part 2: Partition-based clustering¶
In this part of the practical, you will apply partition-based clustering (specifically, the K-Means algorithm) to aggregated data consisting of weekly counts of individual photo-taking events in 5 regions of North America. The regions are labelled as NE (North-East), SE (South-East), NW (North-West), SW (South-West), and GL (Great Lakes).
conda install -c conda-forge cufflinks-py
# Importing Libraries
import os
%env OMP_NUM_THREADS=3
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline
matplotlib.rcParams['figure.figsize'] = (9.0, 3.0)
#import seaborn as sns
import plotly.graph_objs as go
#import chart_studio.plotly as py
#import cufflinks
#pd.options.display.max_columns = 30
#from IPython.core.interactiveshell import InteractiveShell
#import chart_studio.plotly.figure_factory as ff
#InteractiveShell.ast_node_interactivity = 'all'
from plotly.offline import iplot
#cufflinks.go_offline()
#cufflinks.set_config_file(world_readable=True, theme='pearl')
from sklearn.manifold import MDS
from sklearn.preprocessing import MinMaxScaler
from sklearn.cluster import KMeans
np.random.seed(123)
env: OMP_NUM_THREADS=3
A function for generating colours according to positions in a 2D projection¶
The following function getColor(...) will be used for a meaningful assignment of colours to clusters. The idea is to represent clusters by points in an artificial 2D space by applying a data embedding method (such as MDS - non-linear multidimensional scaling) to aggregate characteristics of clusters and generate colours for the clusters based on the positions of the corresponding points.
import math
def getColor(x, y, minX, maxX, minY, maxY):
    """Map a 2D position inside [minX, maxX] x [minY, maxY] to a hex colour.

    Four anchor colours sit at the corners of a square-normalised version of
    the bounding box; the returned colour is a distance-weighted blend of the
    anchors, so points that are close in the projection get similar colours.

    Parameters:
        x, y: coordinates of the point to colour.
        minX, maxX, minY, maxY: extent of the bounding box of all points.
    Returns:
        A colour string of the form '#rrggbb'.
    """
    wX = maxX - minX
    wY = maxY - minY
    rr = y - minY   # vertical offset within the box
    cc = x - minX   # horizontal offset within the box
    # Stretch the shorter axis so both spans become equal (a square of side
    # maxD).  Guard against a degenerate zero-width axis, which previously
    # raised ZeroDivisionError.
    if wX > 0 and wY > 0:
        if wY < wX:  # scale vertically, i.e. modify rr
            rr *= wX / wY
        else:        # scale horizontally, i.e. modify cc
            cc *= wY / wX
    maxD = max(wX, wY)
    if maxD <= 0:
        # Fully degenerate box (all points coincide): fall back to a unit
        # square so the weight computation below stays well-defined.
        maxD = 1
    rr1 = maxD - rr
    cc1 = maxD - cc
    # Euclidean distances from the point to the four corners of the square.
    dc = [
        math.sqrt(rr * rr + cc * cc),
        math.sqrt(rr * rr + cc1 * cc1),
        math.sqrt(rr1 * rr1 + cc * cc),
        math.sqrt(rr1 * rr1 + cc1 * cc1),
    ]
    # Corner weights fall off linearly with distance; corners farther away
    # than maxD contribute nothing.
    weights = [max(0.0, (maxD - d) / maxD) for d in dc]
    # Anchor colours (RGB components) for the four corners.
    reds = [228, 25, 255, 37]
    greens = [220, 228, 18, 13]
    blues = [0, 218, 6, 252]
    dr = sum(w * c for w, c in zip(weights, reds))
    dg = sum(w * c for w, c in zip(weights, greens))
    db = sum(w * c for w, c in zip(weights, blues))
    # Clamp to the displayable 0..255 range (the weights may sum to > 1).
    dr = min(max(dr, 0), 255)
    dg = min(max(dg, 0), 255)
    db = min(max(db, 0), 255)
    return '#{:02x}{:02x}{:02x}'.format(int(dr), int(dg), int(db))
Load the data¶
The data are weekly counts of photo-taking events in the 5 regions denoted NW, SW, NE, SE, GL.
# Loading csv file into Pandas DataFrame
# Expected columns (from the preview below): id, Name, date, year,
# 'week of year', and the five regional count columns NW, SW, NE, GL, SE.
wd = pd.read_csv('weekly_counts_by_regions.csv')
# Preview the first rows to sanity-check the load.
wd.head(5)
| id | Name | date | year | week of year | NW | SW | NE | GL | SE | |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 01/01/2007 | 01/01/2007 | 01/01/2007 | 2007 | 1 | 0 | 0 | 1 | 0 | 1 |
| 1 | 08/01/2007 | 08/01/2007 | 08/01/2007 | 2007 | 2 | 0 | 0 | 1 | 0 | 1 |
| 2 | 15/01/2007 | 15/01/2007 | 15/01/2007 | 2007 | 3 | 0 | 0 | 0 | 0 | 0 |
| 3 | 22/01/2007 | 22/01/2007 | 22/01/2007 | 2007 | 4 | 2 | 0 | 0 | 0 | 1 |
| 4 | 29/01/2007 | 29/01/2007 | 29/01/2007 | 2007 | 5 | 0 | 1 | 1 | 1 | 0 |
# Use the week's 'id' (its start date) as the row index so records can be
# addressed by date.
# NOTE(review): 'id' is kept as a regular column as well, so the same value
# now appears both as the index and as data.
wd.index=wd['id']
wd.head(5)
| id | Name | date | year | week of year | NW | SW | NE | GL | SE | |
|---|---|---|---|---|---|---|---|---|---|---|
| id | ||||||||||
| 01/01/2007 | 01/01/2007 | 01/01/2007 | 01/01/2007 | 2007 | 1 | 0 | 0 | 1 | 0 | 1 |
| 08/01/2007 | 08/01/2007 | 08/01/2007 | 08/01/2007 | 2007 | 2 | 0 | 0 | 1 | 0 | 1 |
| 15/01/2007 | 15/01/2007 | 15/01/2007 | 15/01/2007 | 2007 | 3 | 0 | 0 | 0 | 0 | 0 |
| 22/01/2007 | 22/01/2007 | 22/01/2007 | 22/01/2007 | 2007 | 4 | 2 | 0 | 0 | 0 | 1 |
| 29/01/2007 | 29/01/2007 | 29/01/2007 | 29/01/2007 | 2007 | 5 | 0 | 1 | 1 | 1 | 0 |
Run the clustering algorithm¶
The clustering algorithm K-Means has one parameter K, which is the number of clusters to produce. You are supposed to experiment with different values of this parameter.
To explore how the method works, we recommend this tutorial: https://k-means-explorable.vercel.app/
# Region-count columns used as the clustering features.
counts_columns=['NW','SW','NE','SE','GL']
wd_counts=wd.loc[:,counts_columns]
# Scale every region's counts to [0, 1] so that regions with larger absolute
# counts do not dominate the Euclidean distances used by K-Means.
scaler = MinMaxScaler()
wd_counts_scaled=scaler.fit_transform(wd_counts)
# Use the scaled data already created earlier
data = wd_counts_scaled
from sklearn.metrics import silhouette_score

def _evaluate_kmeans(data, k_values, random_state=10, n_init=10):
    """Fit K-Means for every K in k_values and collect quality metrics.

    Parameters:
        data: 2D array of scaled feature vectors to cluster.
        k_values: iterable of cluster counts K to evaluate.
        random_state, n_init: passed through to sklearn's KMeans.
    Returns:
        Three parallel lists: SSE (inertia), silhouette scores, and the
        fraction of variance explained (1 - SSE/SST) for each K.
    """
    # Total sum of squares (SST): squared distances to the global mean.
    # Computed once, outside the loop — it does not depend on K.
    total_ss = np.sum((data - data.mean(axis=0)) ** 2)
    inertias, silhouettes, explained = [], [], []
    for k in k_values:
        km = KMeans(n_clusters=k, random_state=random_state, n_init=n_init)
        km.fit(data)
        inertias.append(km.inertia_)          # SSE (inertia)
        silhouettes.append(silhouette_score(data, km.labels_))
        explained.append(1 - km.inertia_ / total_ss)
    return inertias, silhouettes, explained

maxrange = 16 # K will run from 2..15
Ks = list(range(2, maxrange))
inertia_values, silhouette_values, explained_variance_values = _evaluate_kmeans(data, Ks)
# Plots — the three charts share all styling except the y-data, label,
# title and line colour, so the boilerplate lives in one helper.
def _plot_metric(x, y, ylabel, title, color=None):
    """Draw one elbow/silhouette line chart with the shared styling."""
    plt.figure(figsize=(6, 4))
    plt.plot(x, y, marker='o', color=color)
    plt.xticks(x)
    plt.xlabel('Number of clusters K')
    plt.ylabel(ylabel)
    plt.title(title)
    plt.grid(True)
    plt.show()

_plot_metric(Ks, inertia_values,
             'Sum of Squared Errors (SSE)', 'Elbow Curve (SSE)')
_plot_metric(Ks, np.array(explained_variance_values) * 100,
             '% Variance Explained', 'Elbow Curve (% Variance Explained)',
             color='tab:green')
_plot_metric(Ks, silhouette_values,
             'Average Silhouette Score', 'Average Silhouette Scores',
             color='tab:orange')

# Optional summary table
pd.DataFrame({
    'K': Ks,
    'SSE': inertia_values,
    '% Variance Explained': np.array(explained_variance_values) * 100,
    'Silhouette': silhouette_values
})
| K | SSE | % Variance Explained | Silhouette | |
|---|---|---|---|---|
| 0 | 2 | 19.129661 | 40.290347 | 0.781282 |
| 1 | 3 | 15.353737 | 52.076186 | 0.791081 |
| 2 | 4 | 12.799675 | 60.048212 | 0.662569 |
| 3 | 5 | 10.411171 | 67.503479 | 0.669665 |
| 4 | 6 | 8.776413 | 72.606071 | 0.674079 |
| 5 | 7 | 7.376995 | 76.974090 | 0.663025 |
| 6 | 8 | 6.470048 | 79.804958 | 0.679490 |
| 7 | 9 | 5.783218 | 81.948768 | 0.682717 |
| 8 | 10 | 5.412990 | 83.104367 | 0.684049 |
| 9 | 11 | 5.052661 | 84.229064 | 0.666637 |
| 10 | 12 | 4.623862 | 85.567481 | 0.671032 |
| 11 | 13 | 4.283939 | 86.628487 | 0.671528 |
| 12 | 14 | 4.186630 | 86.932217 | 0.669594 |
| 13 | 15 | 3.902377 | 87.819460 | 0.541070 |
K = 7 # Change this value to experiment with different numbers of clusters
# n_init=10 keeps this run consistent with the model-selection loop above
# (and avoids sklearn's FutureWarning about the changing n_init default).
kmeans_wd = KMeans(n_clusters=K, random_state=10, n_init=10)
# Fitting the algorithm to the data
kmeans_wd = kmeans_wd.fit(wd_counts_scaled)
# Getting the cluster labels for the dataframe: labels_ holds the labels of
# the training data, so a second predict() pass is unnecessary.
labels = kmeans_wd.labels_
clust_id_col_name='Cluster id'
wd[clust_id_col_name]=labels
# Getting cluster sizes
cluster_sizes = wd[clust_id_col_name].value_counts().rename_axis(clust_id_col_name).to_frame('counts')
print("Cluster sizes:")
print(cluster_sizes)
max_cluster_size=cluster_sizes['counts'].max()
print("max = ",max_cluster_size)
# Getting the Centroid values
centroids = kmeans_wd.cluster_centers_
print("\nCluster centroids:")
print(centroids)
Cluster sizes:
counts
Cluster id
0 417
3 56
1 20
6 11
2 7
5 5
4 5
max = 417
Cluster centroids:
[[0.00785578 0.01100658 0.00514219 0.00852651 0.01514083]
[0.4137931 0.22435897 0.24010067 0.09444444 0.03823529]
[0.11330049 0.12454212 0.08628955 0.00793651 0.767507 ]
[0.0929803 0.19688645 0.03673298 0.04166667 0.01015406]
[0.26896552 0.93333333 0.16040268 0.06666667 0.18431373]
[0.31724138 0.31794872 0.87315436 0.35555556 0.0627451 ]
[0.21525601 0.23543124 0.22940818 0.54040404 0.03030303]]
Generate colours for the clusters by means of spatialisation¶
The spatialisation (projection) is applied to the cluster centroids.
# MDS embeds the K centroids (5-D) into 2-D while approximately preserving
# their pairwise distances, so similar clusters end up with similar colours.
mds_clusters = MDS(n_components = 2, random_state=9)
# One fit_transform is sufficient: the separate fit() call that preceded it
# was redundant (fit_transform refits from scratch with the same seed).
xy_mds = mds_clusters.fit_transform(centroids)
# Bounding box of the projection, used to normalise positions for getColor.
xmin=xy_mds[:,0].min()
xmax=xy_mds[:,0].max()
ymin=xy_mds[:,1].min()
ymax=xy_mds[:,1].max()
print(xmin,xmax,ymin,ymax)
plt.figure(figsize=(5,5))
plt.xlabel('Axis 1')
plt.ylabel('Axis 2')
plt.title('MDS projection of cluster centroids')
cluster_colors = []
for i in range(len(xy_mds)):
    color=getColor(xy_mds[i,0], xy_mds[i,1],xmin,xmax,ymin,ymax)
    cluster_colors.append(color)
    # Marker size reflects the cluster's size relative to the largest cluster.
    j=np.where(cluster_sizes.index==i)[0][0]
    r=cluster_sizes.iat[j,0]/max_cluster_size
    size=50 + 300*r
    plt.scatter(xy_mds[i,0], xy_mds[i,1], alpha = .9, c = color, s = size)
    plt.text(xy_mds[i,0]+0.0001*size, xy_mds[i,1]+0.0001*size, str(i)+": "+str(cluster_sizes.iat[j,0]), alpha = .5)
plt.grid()
-0.599243971364135 0.3942002420239995 -0.6109511963084189 0.6828597996523946
Attach the cluster colours to the data records¶
# Attach each record's cluster colour in one vectorised step: map the
# cluster id to its colour instead of iterating over rows with iterrows().
wd['Color'] = wd[clust_id_col_name].map(lambda cid: cluster_colors[cid])
wd.head(10)
| id | Name | date | year | week of year | NW | SW | NE | GL | SE | Cluster id | Color | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| id | ||||||||||||
| 01/01/2007 | 01/01/2007 | 01/01/2007 | 01/01/2007 | 2007 | 1 | 0 | 0 | 1 | 0 | 1 | 0 | #34cad3 |
| 08/01/2007 | 08/01/2007 | 08/01/2007 | 08/01/2007 | 2007 | 2 | 0 | 0 | 1 | 0 | 1 | 0 | #34cad3 |
| 15/01/2007 | 15/01/2007 | 15/01/2007 | 15/01/2007 | 2007 | 3 | 0 | 0 | 0 | 0 | 0 | 0 | #34cad3 |
| 22/01/2007 | 22/01/2007 | 22/01/2007 | 22/01/2007 | 2007 | 4 | 2 | 0 | 0 | 0 | 1 | 0 | #34cad3 |
| 29/01/2007 | 29/01/2007 | 29/01/2007 | 29/01/2007 | 2007 | 5 | 0 | 1 | 1 | 1 | 0 | 0 | #34cad3 |
| 05/02/2007 | 05/02/2007 | 05/02/2007 | 05/02/2007 | 2007 | 6 | 0 | 2 | 0 | 1 | 0 | 0 | #34cad3 |
| 12/02/2007 | 12/02/2007 | 12/02/2007 | 12/02/2007 | 2007 | 7 | 0 | 6 | 0 | 0 | 0 | 3 | #4fb8bb |
| 19/02/2007 | 19/02/2007 | 19/02/2007 | 19/02/2007 | 2007 | 8 | 0 | 8 | 0 | 0 | 0 | 3 | #4fb8bb |
| 26/02/2007 | 26/02/2007 | 26/02/2007 | 26/02/2007 | 2007 | 9 | 2 | 3 | 1 | 0 | 0 | 0 | #34cad3 |
| 05/03/2007 | 05/03/2007 | 05/03/2007 | 05/03/2007 | 2007 | 10 | 9 | 8 | 0 | 0 | 0 | 3 | #4fb8bb |
Visualise the temporal distribution of the cluster membership¶
To account for the cyclic organisation of the time in our data, we create a 2D plot where one dimension corresponds to the yearly cycle (week of a year) and the other dimension to the sequence of the years. Each weekly interval is represented by a mark painted in the colour of the cluster this interval belongs to.
plt.figure(figsize=(15,3))
plt.xlabel('Week')
plt.ylabel('Year')
plt.title('Calendar')
# One vectorised scatter call instead of one call per row (much faster):
# each weekly cell is a square painted in its cluster's colour.
# (Also removed the unused local `colors = [(0,0,0)]`.)
plt.scatter(wd['week of year'], wd['year'], alpha = 1, s=150, marker='s',
            c=wd['Color'])
plt.grid()
plt.show()
Task: observe and characterise patterns in the temporal distribution¶
Visualise summary characteristics of the clusters in terms of the region-wise event counts¶
# One box-plot figure per region: each figure shows the distribution of that
# region's weekly counts within every cluster, painted in the cluster colour.
for region in counts_columns:  # iterate the columns directly, no index needed
    boxes=[]
    for j in range(len(centroids)):
        box = go.Box(
            # Weekly counts of this region for the rows assigned to cluster j.
            y=wd.loc[wd[clust_id_col_name] == j][region],
            name = j,
            marker = dict(
                color = cluster_colors[j],
            )
        )
        boxes.append(box)
    layout = go.Layout(
        title = "Statistics of counts by the clusters for region "+region
    )
    fig = go.Figure(data=boxes,layout=layout)
    iplot(fig, filename = "Statistics of counts by the clusters for region "+region)
Task 1: Interpret the clusters in terms of the regions having high amounts of photo taking activities.¶
Task 2: Compare the times of the highest photo taking activities in the different regions across the years.¶
For the second task, you need to combine information obtained from the statistical displays with the calendar representation of the temporal distribution cluster membership.